/********************************************************************** * SAS Program: macro studying missing pattern.sas * Created by: Qixuan Chen, Ph.D, (qc2138@columbia.edu) * Department of Biostatistics, Columbia University * Last modified on: August 12, 2010 * NOTE: The paper on this macro will be presented in the SAS Global Forum 2011 in Las Vegas on April 4-7, 2011, "Studying Missing Data Patterns Using a SAS Macro", coauthored with Theresa Schwartz and Naihua Duan. ***********************************************************************/ %macro missingPattern(datain=, varlist=, exclude=, missPattern1=, dataout1=, missPattern2=, dataout2=, missPattern3=, dataout3=, missPattern4=, dataout4=); data newData; set &datain; run; %if %length(&varlist) ne 0 %then %do; /*create subdata with the varibels in the "varlist" only if the length of "varlist" is not zero*/ %if &exclude='TRUE' %then %do; data newData; set newData; drop &varlist; run; %end; %else %do; data newData; set newData; keep &varlist; run; %end; %end; proc contents data=newData; /*output the variable list*/ ods output Variables=varlist; run; proc sql noprint; /*create a macro variable '&nvar' as the total number of variables*/ select count(*) into: nvar from varlist; quit; %let nvar = &nvar; proc sql noprint; /*create '&nvar' number of macro variables '&var1'-'&&var&nvar'*/ select Variable into: var1 -:var&nvar from varlist; select Type into: type1 - :type&nvar from varlist; quit; data temp; set newData; /*create missing indicators 'm_&&var&i'*/ %do i=1 %to &nvar; if ("&&type&i" = 'Num' and &&var&i= .) or ("&&type&i" = 'Char' and &&var&i=' ') then m_&&var&i = 1; else m_&&var&i = 0; %end; run; data temp; set temp; obs = _N_; run; /*It is necessary to run PROC MEANS when there is no numerical variables in "temp"*/ proc means data=temp n; /*create the list of missing patterns with counts*/ class m_&var1 -- m_&&var&nvar; ods output Summary=miss_pattern; run; data miss_pattern; set miss_pattern; keep m_&var1 -- m_&&var&nvar NObs; run; proc sql noprint; /*create a macro variable '&n' as the total sample size*/ select count(*) into: n from newData; quit; %put &n; %if &missPattern1='TRUE' %then %do; data &dataout1; set miss_pattern; missPattern_prop = Nobs*100/&n; keep m_&var1 -- m_&&var&nvar NObs missPattern_prop; run; %end; %if &missPattern2='TRUE' %then %do; data &dataout2; set _null_; format var $40.; run; /*to avoid truncation on "var"*/ %do i=1 %to &nvar; data temp2; set miss_pattern; keep m_&&var&i Nobs; run; proc sql noprint; create table A as select "&&var&i" as var, sum(Nobs*m_&&var&i) as num_miss from temp2; quit; data &dataout2; set &dataout2 A; %end; data &dataout2; set &dataout2; prop_miss = 100*(num_miss/&n); run; %end; %if &missPattern3='TRUE' %then %do; data &dataout3; set _null_; run; %do i=1 %to %eval(&nvar-1); %let int = %eval(&i+1); %do j=&int %to &nvar; proc freq data=miss_Pattern; tables m_&&var&i*m_&&var&j; weight Nobs; ods output CrossTabFreqs=count; run; data count; set count; if m_&&var&i = . or m_&&var&j = . then delete; index=compress(m_&&var&i)||compress(m_&&var&j); keep m_&&var&i m_&&var&j Percent index; run; proc transpose data=count out=B prefix=P; var Percent; ID index; run; data B; set B; var1="&&var&i"; var2="&&var&j"; drop _NAME_ _LABEL_; run; data &dataout3; set &dataout3 B; %end; %end; data &dataout3; retain var1 var2 P00 P01 P10 P11 prop_concordance; set &dataout3; if P00=. then P00=0; if P01=. then P01=0; if P10=. then P10=0; if P11=. then P11=0; prop_concordance = P00 + P11; run; proc sort data=&dataout3; by descending prop_concordance; run; %end; %if &missPattern4='TRUE' %then %do; proc means data=miss_pattern sum; var m_&var1 -- m_&&var&nvar; ods output Summary=temp4; run; proc transpose data=temp4 out=temp4; run; proc sql noprint; select count(*) into: ncomplete from temp4 where COL1=0; /*count the number of variables without any missing values*/ quit; data temp5; set miss_pattern; num_Var_missing = sum(of m_&var1--m_&&var&nvar); /*count the number of missing variables in each missing pattern*/ run; proc sql noprint; select max(num_Var_missing) into: nVarMiss from temp5; /*output the number of missing variables for the missing pattern with the largest number of missing variables*/ quit; %if &ncomplete = %eval(&nvar-&nVarMiss) and &ncomplete < &nvar and &ncomplete > 0 %then %do; %put The candidate missing pattern for unit nonresponse is saved in the data: "&dataout4"!; data &dataout4; set temp5; where num_Var_missing = &nVarMiss; run; %end; %else %put There are no unit nonresponses!; %end; proc datasets library=work; delete A B Newdata Temp Temp2 Temp3 Temp4 Temp5 Varlist count; quit; %mend missingPattern;